


<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="zh-CN" > <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="zh-CN">
<!--<![endif]-->

<head>
  <meta charset="utf-8">
  <meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />

  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>基本介绍 &mdash; HAI Platform  documentation</title>
  

  <link rel="shortcut icon" href="../_static/images/logo192.png" />
  
  

  

  
  
  

  

  <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
  <link rel="stylesheet" href="../_static/css/element-plus.css" type="text/css" />
  <link rel="index" title="Index" href="../genindex.html" />
  <link rel="search" title="Search" href="../search.html" />
  <link rel="next" title="安装与设置" href="install.html" />
  <link rel="prev" title="欢迎来到 HAI Platform 官方文档" href="../index.html" />
  <!-- Google Analytics -->
  <script type="text/javascript">
    // Starts as an empty array on this page.
    // NOTE(review): presumably read by the theme's scripts to decide which
    // sidebar sections start collapsed — confirm against theme.js.
    // The surrounding "Google Analytics" HTML comments do not describe this
    // script; it contains no analytics code.
    var collapsedSections = [];
  </script>
  
  <!-- End Google Analytics -->
  

  
  <script src="../_static/js/modernizr.min.js"></script>
  <script>
    // Global MathJax configuration object. It sets the same two scaling options
    // for both the CommonHTML (chtml) and SVG output sections.
    // NOTE(review): no MathJax loader script tag is visible in this page's
    // markup, so this presumably only matters if MathJax is loaded elsewhere —
    // confirm.
    MathJax = {
        chtml: {
            // presumably the global scaling factor for rendered math (1 = unscaled) — see MathJax output options
            scale: 1,
            // presumably the smallest scaling factor MathJax may apply — TODO confirm
            minScale: 1,
        },
        svg: {
            // same values as the chtml section above
            scale: 1,
            minScale: 1,
        }
    }
</script>

  <!-- Preload the theme fonts -->

<link rel="preload" href="../_static/fonts/FreightSans/freight-sans-book.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/fonts/FreightSans/freight-sans-medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/fonts/FreightSans/freight-sans-bold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/fonts/FreightSans/freight-sans-medium-italic.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2" as="font" type="font/woff2" crossorigin="anonymous">

<!-- Preload the katex fonts -->

<link rel="preload" href="../_static/external/KaTeX_Math-Italic.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Main-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Main-Bold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Size1-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Size4-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Size2-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Size3-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="../_static/external/KaTeX_Caligraphic-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
  <link rel="stylesheet" href="../_static/external/all.css"
    integrity="sha384-vSIIfh2YWi9wW0r9iZe7RJPrKwp6bG+s9QZMoITbCckVJqGCCRhc+ccxNcdpHuYu" crossorigin="anonymous">
</head>

<body class="pytorch-body">

<div class="container-fluid header-holder tutorials-header" id="header-holder">
  <div class="container">
    <div class="header-container">
      <a class="header-logo" href=""
        aria-label="OpenMMLab"></a>

      <div class="main-menu">
        <ul>
        </ul>
      </div>

      <!-- <a class="main-menu-open-button" href="#" data-behavior="open-mobile-menu"></a> -->
    </div>
  </div>
</div>

   

  

  <div class="table-of-contents-link-wrapper">
    <span>Table of Contents</span>
    <a href="#" class="toggle-table-of-contents" data-behavior="toggle-table-of-contents"></a>
  </div>

  <nav data-toggle="wy-nav-shift" class="pytorch-left-menu" id="pytorch-left-menu">
    <div class="pytorch-side-scroll">
      <div class="pytorch-menu pytorch-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
        <div class="pytorch-left-menu-search">
          

          
          
          
          

          



<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
    <input type="text" name="q" placeholder="Search Docs" aria-label="Search Docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        
        
        
        
        
        
        <p class="caption" role="heading"><span class="caption-text">开始使用</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="current reference internal" href="#">基本介绍</a></li>
<li class="toctree-l1"><a class="reference internal" href="install.html">安装与设置</a></li>
<li class="toctree-l1"><a class="reference internal" href="studio.html">Studio</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">用户须知</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../guide/tutorial.html">命令行工具 (CLI)</a></li>
<li class="toctree-l1"><a class="reference internal" href="../guide/environment.html">环境配置</a></li>
<li class="toctree-l1"><a class="reference internal" href="../guide/schedule.html">分时调度</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">CLI 说明</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../cli/user.html">用户命令</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cli/exec.html">运行命令</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cli/task.html">任务命令</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cli/cluster.html">集群命令</a></li>
<li class="toctree-l1"><a class="reference internal" href="../cli/ugc.html">UGC 命令</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">API 说明</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../api/hai.html">hfai</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/client.html">hfai.client</a></li>
<li class="toctree-l1"><a class="reference internal" href="../api/client_remote.html">hfai.client.remote</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">其他</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../misc/env_var.html">环境变量</a></li>
<li class="toctree-l1"><a class="reference internal" href="../misc/resources.html">外部资源</a></li>
</ul>

        
        
      </div>
    </div>
  </nav>

  <div class="pytorch-container">
    <div class="pytorch-page-level-bar" id="pytorch-page-level-bar">
      <div class="pytorch-breadcrumbs-wrapper">
        















<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="pytorch-breadcrumbs">
    
      <li>
        <a href="../index.html">
            Docs
        </a> &gt;
      </li>

        
      <li>基本介绍</li>
    
    
      <li class="pytorch-breadcrumbs-aside">
        
            
            <a href="../_sources/start/hai_intro.md.txt" rel="nofollow"><img src="../_static/images/view-page-source-icon.svg" alt="View page source"></a>
          
        
      </li>
    
  </ul>

  
</div>
      </div>

      <div class="pytorch-shortcuts-wrapper" id="pytorch-shortcuts-wrapper">
        Shortcuts
      </div>
    </div>

    <section data-toggle="wy-nav-shift" id="pytorch-content-wrap" class="pytorch-content-wrap">
      <div class="pytorch-content-left">
        
          <div class="rst-content">
            
            <div role="main" class="main-content" itemscope="itemscope" itemtype="http://schema.org/Article">
              <article itemprop="articleBody" id="pytorch-article" class="pytorch-article">
                
  <section class="tex2jax_ignore mathjax_ignore" id="id1">
<h1>基本介绍<a class="headerlink" href="#id1" title="Permalink to this headline">¶</a></h1>
<p>构建一个能管理大规模 GPU 集群资源的 AI 平台，主要会遇到如下几个痛点问题：</p>
<ul class="simple">
<li><p><strong>资源调度</strong>：算力规模不断扩大，而训练任务的计算需求又多种多样，如何处理任务和算力的关系以最大化集群资源使用？</p></li>
<li><p><strong>使用效率</strong>：集群使用有峰谷差异，同时又要应对突发任务的需求，如何兼顾时效性和集群整体效率？</p></li>
<li><p><strong>迭代适配</strong>：集群会有升级迭代，在调整算力规模、类型、使用规则等场景下，如何让平台快速适配，尽可能降低切换成本？</p></li>
</ul>
<p>针对上述问题，幻方 AI 在自建的萤火集群上进行多年的研发与测试，积累了<strong>一套高效管理 GPU 集群资源的 AI 训练平台方案，名叫 HAI Platform</strong>。其<strong>以任务级分时调度共享 AI 算力的理念将集群零散资源进行整合再分配</strong>，成功支持在 1500+ 计算节点上稳定运行深度学习训练和其他多类型任务，日常算力占用率 95% 以上，日常 GPU 使用率 75% 以上，计算和存储节点间的数据吞吐 7TB/s 以上。</p>
<p>HAI Platform 可以部署在您的私有集群中，帮助您高效利用 GPU 集群资源，提升团队整体研发效率。</p>
<section id="id2">
<h2>产品架构<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h2>
<p>HAI Platform 主要由8个模块组成，可以分为：</p>
<ul class="simple">
<li><p>交互及接口：hub, haproxy, server</p></li>
<li><p>任务调度和管理：scheduler, launcher, manager, k8swatcher</p></li>
<li><p>监控及固化：dbs, monitor</p></li>
</ul>
<p>整体架构如下图所示：</p>
<p><img alt="" src="../_images/platform_arch.png" /></p>
<p>这些模块互相协作形成服务主体，基本的流程是：</p>
<ol class="arabic simple">
<li><p>用户请求经由 hub、haproxy 到达 server 处理 api 响应，任务信息写入数据库；</p></li>
<li><p>scheduler 定期轮询数据库中未完成的任务，按调度规则选出可启动的任务交给 launcher；</p></li>
<li><p>launcher 负责为任务创建 k8s 对象，包括每个任务的 manager 和执行 pod；</p></li>
<li><p>k8swatcher 为服务请求提供缓存。</p></li>
</ol>
</section>
<section id="id3">
<h2>核心概念<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h2>
<section id="id4">
<h3>资源管理<a class="headerlink" href="#id4" title="Permalink to this headline">¶</a></h3>
<section id="id5">
<h4>计算资源<a class="headerlink" href="#id5" title="Permalink to this headline">¶</a></h4>
<p>平台记录所有接入集群的算力节点，按资源类型（cpu，gpu等）、网络区域等条件分类标记，节点是算力资源分配的基本单位。</p>
<p>部署 HAI Platform 的集群不会将 GPU 资源池化，而是<strong>以计算节点为单位，鼓励用户所提交的任务一次性用满多张 GPU，进行并行训练</strong>。
用户提交任务时需选定节点数量 n，则该任务可获得 n 的整数倍个 GPU，比如在8卡计算节点上提交使用4个节点的任务，则该任务会获得32张 GPU 进行并行训练。每个任务<strong>至少使用 1 个节点</strong>。</p>
</section>
<section id="id6">
<h4>存储资源<a class="headerlink" href="#id6" title="Permalink to this headline">¶</a></h4>
<p>平台及所有算力节点需要连接<strong>集中存储</strong>，管理文件内容包括：</p>
<ul class="simple">
<li><p>运行时需要的配置文件、启动脚本</p></li>
<li><p>服务日志</p></li>
<li><p>pgsql、redis等持久化文件目录</p></li>
<li><p>用户工作区文件目录，包括代码、配置文件、数据文件、任务运行日志等</p></li>
<li><p>虚拟环境文件目录</p></li>
<li><p>数据集</p></li>
</ul>
<p>通常在分布式训练过程中，数据集需要通过高速文件系统读取，以提高训练效率，因此建议根据容量和性能需求搭建多个文件系统，并合理设计分配流量。</p>
</section>
</section>
<section id="id7">
<h3>任务调度<a class="headerlink" href="#id7" title="Permalink to this headline">¶</a></h3>
<section id="id8">
<h4>任务<a class="headerlink" href="#id8" title="Permalink to this headline">¶</a></h4>
<p>HAI Platform 的用户使用集群资源都是以<strong>任务</strong>的方式提交给平台，例如提交运行python代码、运行bash代码、启动开发容器等。</p>
<p><strong>任务是 HAI Platform 使用的基本单位，而非用户</strong>。所有用户提交的任务都由 HAI Platform 统一管理和调度。</p>
</section>
<section id="id9">
<h4>分时调度<a class="headerlink" href="#id9" title="Permalink to this headline">¶</a></h4>
<p>部署 HAI Platform 的集群以<strong>分时调度</strong>的方式为任务分配计算节点，根据当前资源需求、集群忙闲程度等进行任务的中断和加载，<strong>任务代码需要遵循平台编码规则以确保可以断点续跑</strong>，具体包括：</p>
<ul class="simple">
<li><p>接受集群的打断信号；</p></li>
<li><p>保存 checkpoint（模型参数，优化器参数等）；</p></li>
<li><p>通知集群打断；</p></li>
<li><p>从 checkpoint 恢复，继续运行。</p></li>
</ul>
<p>具体操作请阅读<a class="reference internal" href="../guide/schedule.html"><span class="doc std std-doc">分时调度</span></a>篇章内容。</p>
</section>
</section>
<section id="id10">
<h3>用户管理<a class="headerlink" href="#id10" title="Permalink to this headline">¶</a></h3>
<section id="id11">
<h4>用户配额<a class="headerlink" href="#id11" title="Permalink to this headline">¶</a></h4>
<p>HAI Platform 以<strong>配额</strong>的方式记录集群的各类资源，通过<strong>优先级</strong>管理用户的使用权限，例如：某用户在 NORMAL 优先级上有10个 GPU 节点的配额。其在 HAI Platform 上提交若干个任务，则最多同时以 NORMAL 优先级调度10个计算节点运行。
算力资源优先满足高优先级用户的训练需求。同优先级内交替使用集群算力资源。</p>
<p><a class="reference internal" href="../guide/schedule.html"><span class="doc std std-doc"><img alt="" src="../_images/schedule_04.png" /></span></a></p>
</section>
<section id="id12">
<h4>用户组<a class="headerlink" href="#id12" title="Permalink to this headline">¶</a></h4>
<p>每个用户都属于一个用户组，资源和优先级配额按用户分配，同组用户可以共享虚拟环境、私有数据集。</p>
</section>
<section id="id13">
<h4>管理员权限<a class="headerlink" href="#id13" title="Permalink to this headline">¶</a></h4>
<p>管理员身份的用户有权限执行用户创建、停用、归档等操作，调整各类资源及优先级配额，后台管理开发容器等。</p>
</section>
</section>
<section id="id14">
<h3>工作区<a class="headerlink" href="#id14" title="Permalink to this headline">¶</a></h3>
<p>用户使用 HAI Platform 进行 AI 训练，需要先完成代码、数据、环境等迁移到集群。平台按用户、用户组在集群建立和管理工作区、数据集仓库。</p>
<p>工作区是用户在集群上自主管理的存储空间，有以下使用场景：</p>
<ol class="arabic simple">
<li><p>用户使用集群开发容器，已挂载工作区路径，在终端命令行访问；</p></li>
<li><p>用户在本地环境（集群外），使用平台提供的客户端工具与集群侧工作区目录同步、文件传输；</p></li>
<li><p>用户提交任务运行，任务容器挂载工作区路径，执行工作区路径下的代码。</p></li>
</ol>
</section>
<section id="id15">
<h3>环境管理<a class="headerlink" href="#id15" title="Permalink to this headline">¶</a></h3>
<p>部署 HAI Platform 的集群会包含<strong>基础环境</strong>和用户<strong>自定义环境</strong>两种环境管理模式：</p>
<ul class="simple">
<li><p>基础环境由平台统一管理，包含大部分主流依赖包，任务运行时默认加载。通常根据系统、GPU驱动及cuda版本等要求统一管理若干个基础环境。</p></li>
<li><p>自定义环境由用户通过客户端工具按需求安装依赖，同组用户可共享。</p></li>
</ul>
<p>两种环境都可以通过 <a class="reference internal" href="../cli/ugc.html"><span class="doc std std-doc"><code class="docutils literal notranslate"><span class="pre">haienv</span></code></span></a> 工具进行管理，适用于各种任务场景。通过基础环境的构建，我们希望尽可能降低 AI 研发前期的环境构建成本。</p>
<p>更多细节请阅读<a class="reference internal" href="../guide/environment.html"><span class="doc std std-doc">环境配置</span></a>篇章内容。</p>
</section>
<section id="id16">
<h3>可视化交互<a class="headerlink" href="#id16" title="Permalink to this headline">¶</a></h3>
<p>开发容器额外运行了 JupyterLab 服务，支持可视化和交互式的开发方式：</p>
<p><img alt="" src="../_images/studio_screenshot1.png" /></p>
<p>更多细节请阅读 <a class="reference internal" href="studio.html"><span class="doc std std-doc">Studio</span></a> 篇章内容。</p>
</section>
<section id="id17">
<h3>数据管理<a class="headerlink" href="#id17" title="Permalink to this headline">¶</a></h3>
<p>部署 HAI Platform 的集群会将训练数据存储在文件系统中。训练中的任务会实时从文件系统中读取数据，流转到计算节点。这里，优良的文件系统将是提高 GPU 利用率，进而提升集群整体效率的决定性因素之一。</p>
<p>为了最大化 GPU 利用率，降低数据读取的成本，幻方 AI 自研了<a class="reference external" href="https://www.high-flyer.cn/blog/3fs/">高速文件系统 3FS</a>，其专门针对深度学习训练中样本读取的场景进行优化，能支持计算和存储节点间 7TB/s 以上的数据吞吐。3FS 需要用户将原始数据中较多的小文件聚合成大文件进行存储，推荐使用<a class="reference external" href="https://www.high-flyer.cn/blog/ffrecord/"> FFRecord 格式规范</a>来聚合文件。</p>
</section>
</section>
</section>


              </article>
              
            </div>
            <footer>
  
  <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
    
    <a href="install.html" class="btn btn-neutral float-right" title="安装与设置" accesskey="n"
      rel="next">Next <img src="../_static/images/chevron-right-blue.svg"
        class="next-page" alt=""></a>
    
    
    <a href="../index.html" class="btn btn-neutral" title="欢迎来到 HAI Platform 官方文档" accesskey="p"
      rel="prev"><img src="../_static/images/chevron-right-blue.svg" class="previous-page" alt=""> Previous</a>
    
  </div>
  

  <hr>

  <div role="contentinfo">
    <p>
      &copy; Copyright 2023, High-Flyer.

    </p>
  </div>
  
  <div>
    Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a <a
      href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the
      Docs</a>.
  </div>
   

</footer>
          </div>
        </div>

        <div class="pytorch-content-right" id="pytorch-content-right">
          <div class="pytorch-right-menu" id="pytorch-right-menu">
            <div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
              <ul>
<li><a class="reference internal" href="#">基本介绍</a><ul>
<li><a class="reference internal" href="#id2">产品架构</a></li>
<li><a class="reference internal" href="#id3">核心概念</a><ul>
<li><a class="reference internal" href="#id4">资源管理</a><ul>
<li><a class="reference internal" href="#id5">计算资源</a></li>
<li><a class="reference internal" href="#id6">存储资源</a></li>
</ul>
</li>
<li><a class="reference internal" href="#id7">任务调度</a><ul>
<li><a class="reference internal" href="#id8">任务</a></li>
<li><a class="reference internal" href="#id9">分时调度</a></li>
</ul>
</li>
<li><a class="reference internal" href="#id10">用户管理</a><ul>
<li><a class="reference internal" href="#id11">用户配额</a></li>
<li><a class="reference internal" href="#id12">用户组</a></li>
<li><a class="reference internal" href="#id13">管理员权限</a></li>
</ul>
</li>
<li><a class="reference internal" href="#id14">工作区</a></li>
<li><a class="reference internal" href="#id15">环境管理</a></li>
<li><a class="reference internal" href="#id16">可视化交互</a></li>
<li><a class="reference internal" href="#id17">数据管理</a></li>
</ul>
</li>
</ul>
</li>
</ul>

            </div>
          </div>
        </div>
    </section>
  </div>

  


  

  
  <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
  <script src="../_static/jquery.js"></script>
  <script src="../_static/underscore.js"></script>
  <script src="../_static/doctools.js"></script>
  <script src="../_static/js/vue.global.prod.js"></script>
  <script src="../_static/js/element-plus.full.js"></script>
  

  

  <script type="text/javascript" src="../_static/js/vendor/popper.min.js"></script>
  <script type="text/javascript" src="../_static/js/vendor/bootstrap.min.js"></script>
  <script src="../_static/external/list.min.js"></script>
  <script type="text/javascript" src="../_static/js/theme.js"></script>

  <script type="text/javascript">
    // Passing a function to jQuery() registers it to run once the DOM is ready.
    // The callback calls SphinxRtdTheme.Navigation.enable with `true`.
    // NOTE(review): SphinxRtdTheme is not defined in this page (presumably it
    // comes from ../_static/js/theme.js), and the meaning of the `true`
    // argument is not visible here — confirm against the theme source.
    jQuery(function () {
      SphinxRtdTheme.Navigation.enable(true);
      });
  </script> 

  <!-- Begin Footer -->

  <!-- <div class="container-fluid docs-tutorials-resources" id="docs-tutorials-resources">
  </div> -->

  <!-- End Footer -->

  <!-- Begin Mobile Menu -->

  <div class="mobile-main-menu">
    <div class="container-fluid">
      <div class="container">
        <div class="mobile-main-menu-header-container">
          <a class="header-logo" href="" aria-label="OpenMMLab"></a>
          <a class="main-menu-close-button" href="#" data-behavior="close-mobile-menu"></a>
        </div>
      </div>
    </div>

    <div class="mobile-main-menu-links-container">
      <div class="main-menu">
        <ul>
        </ul>
      </div>
    </div>
  </div>

  <!-- End Mobile Menu -->

  <script type="text/javascript" src="../_static/js/vendor/anchor.min.js"></script>

  <script type="text/javascript">
    // Runs once the DOM is ready (jQuery document-ready hook).
    $(document).ready(function () {
      // Call bind() on each UI helper object. None of these objects is defined
      // in this page; presumably they are globals from ../_static/js/theme.js —
      // confirm. The call order is left untouched because any dependencies
      // between the helpers are not visible from here.
      mobileMenu.bind();
      mobileTOC.bind();
      pytorchAnchors.bind();
      sideMenus.bind();
      scrollToAnchor.bind();
      highlightNavigation.bind();
      mainMenuDropdown.bind();
      filterTags.bind();

      // Add class to links that have code blocks, since we cannot create links in code blocks
      $("article.pytorch-article a span.pre").each(function (e) {
        // `e` is unused; `this` is the matched span.pre element, and the
        // nearest enclosing <a> receives the "has-code" class.
        $(this).closest("a").addClass("has-code");
      });
    })
  </script>
</body>

</html>